# libraries
library(forcats)
library(lubridate)
library(plotly)
library(readr)
library(tidyverse)
# data
# Read the Scooby-Doo table directly from the Tidy Tuesday GitHub repository.
scoobydoo <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-07-13/scoobydoo.csv"
)
scooby doo
This week's Tidy Tuesday dataset comes from Kaggle by way of manual data aggregation from plummye.
Every Scooby-Doo episode and movie’s various variables.
Took ~1 year to watch every Scooby-Doo iteration and track every variable. Many values are subjective by nature of watching but I tried my hardest to keep the data collection consistent.
If you plan to use this data for anything school/entertainment related, you are free to (credit is always welcome).
# Preview the first five rows of the raw data.
head(scoobydoo, n = 5)
# tidy data
# values
# Palette for the ranking tiers, listed from best to worst so that it lines up
# with the order of the ranking factor levels used in the plots.
color_scheme1 <- c(
  "#228B22", # forest green -> Best
  "#98FB98", # pale green   -> Top 3
  "#D3D3D3", # light grey   -> Other
  "#FFB6C1", # light pink   -> Bottom 3
  "#DC143C"  # crimson      -> Worst
)
## get summary info for each series
# One row per (series_name, network): first and last air date, episode count,
# and the mean IMDb score and mean engagement across its episodes. Groups with
# a single episode are dropped, and the remaining series are numbered in order
# of their first (then last) air date.
series_info <- scoobydoo %>%
  mutate(
    # entries that are not numbers (e.g. the string "NULL") become NA here and
    # are then skipped by na.rm = TRUE below
    imdb = as.double(imdb),
    engagement = as.double(engagement)
  ) %>%
  group_by(series_name, network) %>%
  dplyr::summarise(
    series_start = min(date_aired),
    series_end = max(date_aired),
    n_episodes = n(),
    mean_imdb = mean(imdb, na.rm = TRUE),
    mean_engagement = mean(engagement, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  filter(n_episodes > 1) %>% # filter out movie events
  arrange(series_start, series_end) %>%
  mutate(
    # sequential id in air-date order; used as the x position in the plots
    series_id = as.double(row_number())
  )
# Label every series by where its mean IMDb score falls: the single best and
# single worst series, the rest of the top/bottom three, and "Other".
# Earlier case_when() branches win, so the best series is not also "Top 3".
series_info <- local({
  ids_high_to_low <- arrange(series_info, desc(mean_imdb))$series_id
  ids_low_to_high <- arrange(series_info, mean_imdb)$series_id

  tier <- case_when(
    series_info$series_id == head(ids_high_to_low, 1) ~ "Best Series",
    series_info$series_id %in% head(ids_high_to_low, 3) ~ "Top 3 Series",
    series_info$series_id == head(ids_low_to_high, 1) ~ "Worst Series",
    series_info$series_id %in% head(ids_low_to_high, 3) ~ "Bottom 3 Series",
    TRUE ~ "Other"
  )

  # level order runs best -> worst to match the colour palette order
  tier_levels <- c(
    "Best Series", "Top 3 Series", "Other", "Bottom 3 Series", "Worst Series"
  )
  mutate(series_info, ranking = factor(tier, levels = tier_levels))
})
# plot series over time
# Bar chart of the mean IMDb score of each series, positioned by series_id
# (air-date order) and coloured by ranking tier. The text string carries the
# series name, air dates, network, score, review count and episode count.
series_info %>%
  plot_ly(
    # bar traces have no `mode` attribute, so none is passed
    type = 'bar',
    x = ~series_id,
    y = ~mean_imdb,
    color = ~ranking,
    colors = color_scheme1,
    text = ~paste0("<b>", series_name, "</b><br>",
                   "<i>Aired from ", series_start, " to ", series_end, " on ", network, "</i><br><br>",
                   "Mean IMDb Score: ", round(mean_imdb, 2), " (Number of Reviews: ", round(mean_engagement), ")<br>",
                   "Episodes: ", n_episodes, "<br>"),
    # figure size is set here; passing width/height to layout() is deprecated
    width = 800,
    height = 400
  ) %>%
  layout(
    title = 'IMDb Scores for Scooby Doo Series Over Time',
    xaxis = list(title = 'Sequential Series Number', showticklabels = FALSE),
    yaxis = list(title = 'Mean IMDb Score'),
    legend = list(orientation = 'h', y = -0.3)
  )
## get summary info for each season
# One row per (series_name, network, season): first and last air date, episode
# count, and the mean IMDb score and mean engagement across its episodes.
# Single-episode groups and the "Movie"/"Special" seasons are dropped, and the
# remaining seasons are numbered in order of first (then last) air date.
season_info <- scoobydoo %>%
  mutate(
    # entries that are not numbers (e.g. the string "NULL") become NA here and
    # are then skipped by na.rm = TRUE below
    imdb = as.double(imdb),
    engagement = as.double(engagement)
  ) %>%
  group_by(series_name, network, season) %>%
  dplyr::summarise(
    season_start = min(date_aired),
    season_end = max(date_aired),
    n_episodes = n(),
    mean_imdb = mean(imdb, na.rm = TRUE),
    mean_engagement = mean(engagement, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # filter out movie events
  filter(
    n_episodes > 1,
    !(season %in% c("Movie", "Special"))
  ) %>%
  arrange(season_start, season_end) %>%
  mutate(
    # sequential id in air-date order; used as the x position in the plot
    season_id = as.double(row_number())
  )
# Label every season by where its mean IMDb score falls: the single best and
# single worst season, the rest of the top/bottom three, and "Other".
# Earlier case_when() branches win, so the best season is not also "Top 3".
season_info <- local({
  ids_high_to_low <- arrange(season_info, desc(mean_imdb))$season_id
  ids_low_to_high <- arrange(season_info, mean_imdb)$season_id

  tier <- case_when(
    season_info$season_id == head(ids_high_to_low, 1) ~ "Best Season",
    season_info$season_id %in% head(ids_high_to_low, 3) ~ "Top 3 Season",
    season_info$season_id == head(ids_low_to_high, 1) ~ "Worst Season",
    season_info$season_id %in% head(ids_low_to_high, 3) ~ "Bottom 3 Season",
    TRUE ~ "Other"
  )

  # level order runs best -> worst to match the colour palette order
  tier_levels <- c(
    "Best Season", "Top 3 Season", "Other", "Bottom 3 Season", "Worst Season"
  )
  mutate(season_info, ranking = factor(tier, levels = tier_levels))
})
The average Scooby Doo TV series has 2.0625 seasons. That’s so few!
# plot season over time
# Bar chart of the mean IMDb score of each season, positioned by season_id
# (air-date order) and coloured by ranking tier. The text string carries the
# series and season, air dates, network, score, review count and episode count.
season_info %>%
  plot_ly(
    # bar traces have no `mode` attribute, so none is passed
    type = 'bar',
    x = ~season_id,
    y = ~mean_imdb,
    color = ~ranking,
    colors = color_scheme1,
    text = ~paste0("<b>", series_name, " - Season ", season, "</b><br>",
                   "<i>Aired from ", season_start, " to ", season_end, " on ", network, "</i><br><br>",
                   "Mean IMDb Score: ", round(mean_imdb, 2), " (Number of Reviews: ", round(mean_engagement), ")<br>",
                   "Episodes: ", n_episodes, "<br>"),
    # figure size is set here; passing width/height to layout() is deprecated
    width = 800,
    height = 400
  ) %>%
  layout(
    title = 'IMDb Scores for Scooby Doo Seasons Over Time',
    xaxis = list(title = 'Sequential Season Number', showticklabels = FALSE),
    yaxis = list(title = 'Mean IMDb Score'),
    legend = list(orientation = 'h', y = -0.3)
  )
# plot episodes over time
# Scatter plot of each rated episode's IMDb score against its index, coloured
# by the ranking tier of the series it belongs to (joined from series_info).
# Rows without a usable score or review count, and "Movie"/"Special" seasons,
# are excluded.
scoobydoo %>%
  left_join(series_info, by = c("series_name", "network")) %>%
  filter(
    !(is.na(imdb)),
    imdb != "NULL",
    engagement != "NULL",
    !(season %in% c("Movie", "Special"))
  ) %>%
  plot_ly(
    type = 'scatter',
    mode = 'markers',
    x = ~index,
    # imdb is compared against the string "NULL" above, so it may be stored as
    # text; convert it so the score is plotted on a numeric axis
    y = ~as.double(imdb),
    color = ~ranking,
    colors = color_scheme1,
    text = ~paste0("<b>", title, "</b><br>",
                   "<i>Season ", season, " of Series ", series_name, "</i><br><br>",
                   "Aired ", date_aired, " on ", network, "<br>",
                   "IMDb Score ", imdb, " (Number of Reviews: ", engagement, ")"),
    # figure size is set here; passing width/height to layout() is deprecated
    width = 800,
    height = 400
  ) %>%
  layout(
    title = 'IMDb scores of Scooby Doo episodes over time',
    xaxis = list(title = 'Episode Index (according to Scoobypedia)'),
    yaxis = list(title = 'IMDb Score'),
    legend = list(orientation = 'h', y = -0.3)
  )
And subtype?
Or Species?
Were not captured?